To run this code in my project using the renv environment, run the following lines of code
install.packages("renv") #install the package on the new computer (may not be necessary if renv bootstraps itself as expected)
renv::restore() #reinstall all the package versions in the renv lockfile
packages <- c("genefilter", "DESeq2", "apeglm", "ashr", "ggplot2", "vsn",
"hexbin", "pheatmap", "RColorBrewer", "EnhancedVolcano", "tidyverse")
lapply(packages, library, character.only = TRUE)
## Warning: package 'DESeq2' was built under R version 4.3.3
## Warning: package 'GenomeInfoDb' was built under R version 4.3.3
## Warning: package 'matrixStats' was built under R version 4.3.3
## Warning: package 'hexbin' was built under R version 4.3.3
## Warning: package 'ggrepel' was built under R version 4.3.3
## [[1]]
## [1] "genefilter" "stats" "graphics" "grDevices" "datasets"
## [6] "utils" "methods" "base"
##
## [[2]]
## [1] "DESeq2" "SummarizedExperiment" "Biobase"
## [4] "MatrixGenerics" "matrixStats" "GenomicRanges"
## [7] "GenomeInfoDb" "IRanges" "S4Vectors"
## [10] "BiocGenerics" "stats4" "genefilter"
## [13] "stats" "graphics" "grDevices"
## [16] "datasets" "utils" "methods"
## [19] "base"
##
## [[3]]
## [1] "apeglm" "DESeq2" "SummarizedExperiment"
## [4] "Biobase" "MatrixGenerics" "matrixStats"
## [7] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [10] "S4Vectors" "BiocGenerics" "stats4"
## [13] "genefilter" "stats" "graphics"
## [16] "grDevices" "datasets" "utils"
## [19] "methods" "base"
##
## [[4]]
## [1] "ashr" "apeglm" "DESeq2"
## [4] "SummarizedExperiment" "Biobase" "MatrixGenerics"
## [7] "matrixStats" "GenomicRanges" "GenomeInfoDb"
## [10] "IRanges" "S4Vectors" "BiocGenerics"
## [13] "stats4" "genefilter" "stats"
## [16] "graphics" "grDevices" "datasets"
## [19] "utils" "methods" "base"
##
## [[5]]
## [1] "ggplot2" "ashr" "apeglm"
## [4] "DESeq2" "SummarizedExperiment" "Biobase"
## [7] "MatrixGenerics" "matrixStats" "GenomicRanges"
## [10] "GenomeInfoDb" "IRanges" "S4Vectors"
## [13] "BiocGenerics" "stats4" "genefilter"
## [16] "stats" "graphics" "grDevices"
## [19] "datasets" "utils" "methods"
## [22] "base"
##
## [[6]]
## [1] "vsn" "ggplot2" "ashr"
## [4] "apeglm" "DESeq2" "SummarizedExperiment"
## [7] "Biobase" "MatrixGenerics" "matrixStats"
## [10] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [13] "S4Vectors" "BiocGenerics" "stats4"
## [16] "genefilter" "stats" "graphics"
## [19] "grDevices" "datasets" "utils"
## [22] "methods" "base"
##
## [[7]]
## [1] "hexbin" "vsn" "ggplot2"
## [4] "ashr" "apeglm" "DESeq2"
## [7] "SummarizedExperiment" "Biobase" "MatrixGenerics"
## [10] "matrixStats" "GenomicRanges" "GenomeInfoDb"
## [13] "IRanges" "S4Vectors" "BiocGenerics"
## [16] "stats4" "genefilter" "stats"
## [19] "graphics" "grDevices" "datasets"
## [22] "utils" "methods" "base"
##
## [[8]]
## [1] "pheatmap" "hexbin" "vsn"
## [4] "ggplot2" "ashr" "apeglm"
## [7] "DESeq2" "SummarizedExperiment" "Biobase"
## [10] "MatrixGenerics" "matrixStats" "GenomicRanges"
## [13] "GenomeInfoDb" "IRanges" "S4Vectors"
## [16] "BiocGenerics" "stats4" "genefilter"
## [19] "stats" "graphics" "grDevices"
## [22] "datasets" "utils" "methods"
## [25] "base"
##
## [[9]]
## [1] "RColorBrewer" "pheatmap" "hexbin"
## [4] "vsn" "ggplot2" "ashr"
## [7] "apeglm" "DESeq2" "SummarizedExperiment"
## [10] "Biobase" "MatrixGenerics" "matrixStats"
## [13] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [16] "S4Vectors" "BiocGenerics" "stats4"
## [19] "genefilter" "stats" "graphics"
## [22] "grDevices" "datasets" "utils"
## [25] "methods" "base"
##
## [[10]]
## [1] "EnhancedVolcano" "ggrepel" "RColorBrewer"
## [4] "pheatmap" "hexbin" "vsn"
## [7] "ggplot2" "ashr" "apeglm"
## [10] "DESeq2" "SummarizedExperiment" "Biobase"
## [13] "MatrixGenerics" "matrixStats" "GenomicRanges"
## [16] "GenomeInfoDb" "IRanges" "S4Vectors"
## [19] "BiocGenerics" "stats4" "genefilter"
## [22] "stats" "graphics" "grDevices"
## [25] "datasets" "utils" "methods"
## [28] "base"
##
## [[11]]
## [1] "lubridate" "forcats" "stringr"
## [4] "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "tidyverse"
## [10] "EnhancedVolcano" "ggrepel" "RColorBrewer"
## [13] "pheatmap" "hexbin" "vsn"
## [16] "ggplot2" "ashr" "apeglm"
## [19] "DESeq2" "SummarizedExperiment" "Biobase"
## [22] "MatrixGenerics" "matrixStats" "GenomicRanges"
## [25] "GenomeInfoDb" "IRanges" "S4Vectors"
## [28] "BiocGenerics" "stats4" "genefilter"
## [31] "stats" "graphics" "grDevices"
## [34] "datasets" "utils" "methods"
## [37] "base"
sessionInfo() #provides list of loaded packages and version of R.
## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.0
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices datasets utils methods
## [8] base
##
## other attached packages:
## [1] lubridate_1.9.3 forcats_1.0.0
## [3] stringr_1.5.1 dplyr_1.1.4
## [5] purrr_1.0.2 readr_2.1.5
## [7] tidyr_1.3.1 tibble_3.2.1
## [9] tidyverse_2.0.0 EnhancedVolcano_1.20.0
## [11] ggrepel_0.9.6 RColorBrewer_1.1-3
## [13] pheatmap_1.0.12 hexbin_1.28.4
## [15] vsn_3.70.0 ggplot2_3.5.1
## [17] ashr_2.2-63 apeglm_1.24.0
## [19] DESeq2_1.42.1 SummarizedExperiment_1.32.0
## [21] Biobase_2.62.0 MatrixGenerics_1.14.0
## [23] matrixStats_1.4.1 GenomicRanges_1.54.1
## [25] GenomeInfoDb_1.38.8 IRanges_2.36.0
## [27] S4Vectors_0.40.2 BiocGenerics_0.48.1
## [29] genefilter_1.84.0
##
## loaded via a namespace (and not attached):
## [1] DBI_1.2.3 bitops_1.0-9 rlang_1.1.4
## [4] magrittr_2.0.3 compiler_4.3.2 RSQLite_2.3.7
## [7] png_0.1-8 vctrs_0.6.5 pkgconfig_2.0.3
## [10] crayon_1.5.3 fastmap_1.2.0 XVector_0.42.0
## [13] utf8_1.2.4 rmarkdown_2.28 tzdb_0.4.0
## [16] preprocessCore_1.64.0 bit_4.5.0 xfun_0.48
## [19] zlibbioc_1.48.2 cachem_1.1.0 jsonlite_1.8.9
## [22] blob_1.2.4 DelayedArray_0.28.0 BiocParallel_1.36.0
## [25] irlba_2.3.5.1 parallel_4.3.2 R6_2.5.1
## [28] stringi_1.8.4 bslib_0.8.0 limma_3.58.1
## [31] SQUAREM_2021.1 jquerylib_0.1.4 numDeriv_2016.8-1.1
## [34] Rcpp_1.0.13 knitr_1.48 timechange_0.3.0
## [37] Matrix_1.6-5 splines_4.3.2 tidyselect_1.2.1
## [40] rstudioapi_0.17.0 abind_1.4-8 yaml_2.3.10
## [43] codetools_0.2-20 affy_1.80.0 lattice_0.22-6
## [46] plyr_1.8.9 withr_3.0.1 KEGGREST_1.42.0
## [49] coda_0.19-4.1 evaluate_1.0.1 survival_3.7-0
## [52] Biostrings_2.70.3 pillar_1.9.0 affyio_1.72.0
## [55] BiocManager_1.30.25 renv_1.0.11 generics_0.1.3
## [58] invgamma_1.1 RCurl_1.98-1.16 truncnorm_1.0-9
## [61] emdbook_1.3.13 hms_1.1.3 munsell_0.5.1
## [64] scales_1.3.0 xtable_1.8-4 glue_1.8.0
## [67] tools_4.3.2 annotate_1.80.0 locfit_1.5-9.10
## [70] mvtnorm_1.3-1 XML_3.99-0.17 grid_4.3.2
## [73] bbmle_1.0.25.1 bdsmatrix_1.3-7 AnnotationDbi_1.64.1
## [76] colorspace_2.1-1 GenomeInfoDbData_1.2.11 cli_3.6.3
## [79] fansi_1.0.6 mixsqp_0.3-54 S4Arrays_1.2.1
## [82] gtable_0.3.5 sass_0.4.9 digest_0.6.37
## [85] SparseArray_1.2.4 memoise_2.0.1 htmltools_0.5.8.1
## [88] lifecycle_1.0.4 httr_1.4.7 statmod_1.5.0
## [91] bit64_4.5.2 MASS_7.3-60.0.1
save_ggplot <- function(plot, filename, width = 10, height = 7, units = "in", dpi = 300) {
# Display plot
print(plot)
# Save plot
ggsave(filename = paste0(filename, ".png"), plot = plot, width = width, height = height, units = units, dpi = dpi)
}
Read in raw count data
counts_raw <- read.csv("../output_RNA/stringtie-GeneExt/LCM_RNA_gene_count_matrix.csv", row.names = 1) #load in data
gene_id,LCM_15,LCM_16,LCM_20,LCM_21,LCM_26,LCM_27,LCM_4,LCM_5,LCM_8,LCM_9
Read in metadata
meta <- read.csv("../data_RNA/LCM_RNA_metadata.csv") %>%
dplyr::arrange(Sample) %>%
mutate(across(c(Tissue, Fragment, Section_Date, LCM_Date), factor)) # Set variables as factors
meta$Tissue <- factor(meta$Tissue, levels = c("OralEpi","Aboral")) #we want OralEpi to be the baseline
Data sanity checks!
all(meta$Sample %in% colnames(counts_raw)) #are all of the sample names in the metadata column names in the gene count matrix? Should be TRUE
## [1] TRUE
all(meta$Sample == colnames(counts_raw)) #are they the same in the same order? Should be TRUE
## [1] TRUE
ffun<-filterfun(pOverA(0.5,10)) #Keep genes expressed in at least 50% of samples -
counts_filt_poa <- genefilter((counts_raw), ffun) #apply filter
filtered_counts <- counts_raw[counts_filt_poa,] #keep only rows that passed filter
cat("Number of genes after filtering:", sum(counts_filt_poa))
## Number of genes after filtering: 14464
write.csv(filtered_counts, "../output_RNA/differential_expression/filtered_counts.csv")
There are now 14464 genes in the filtered dataset.
Data sanity checks:
all(meta$Sample %in% colnames(filtered_counts)) #are all of the sample names in the metadata column names in the gene count matrix?
## [1] TRUE
all(meta$Sample == colnames(filtered_counts)) #are they the same in the same order? Should be TRUE
## [1] TRUE
Create DESeq object and run DESeq2
dds <- DESeqDataSetFromMatrix(countData = filtered_counts,
colData = meta,
design= ~ Fragment + Tissue)
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds, contrast = c("Tissue","Aboral","OralEpi"))
resLFC <- lfcShrink(dds, coef="Tissue_Aboral_vs_OralEpi", res=res, type = "apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
res <- resLFC
resOrdered <- res[order(res$pvalue),]# save differentially expressed genes
DE_05 <- as.data.frame(resOrdered) %>% filter(padj < 0.05)
DE_05_Up <- DE_05 %>% filter(log2FoldChange > 0) #Higher in Aboral, Lower in OralEpi
DE_05_Down <- DE_05 %>% filter(log2FoldChange < 0) #Lower in Aboral, Higher in OralEpi
nrow(DE_05)
## [1] 3606
nrow(DE_05_Up) #Higher in Aboral, Lower in OralEpi
## [1] 804
nrow(DE_05_Down) #Lower in Aboral, Higher in OralEpi
## [1] 2802
write.csv(as.data.frame(resOrdered),
file="../output_RNA/differential_expression/DESeq_results.csv")
write.csv(DE_05,
file="../output_RNA/differential_expression/DEG_05.csv")
EnhancedVolcano(resLFC,
lab = rownames(resLFC),
x = 'log2FoldChange',
y = 'pvalue')
plotMA(results(dds, contrast = c("Tissue","Aboral","OralEpi")), ylim=c(-20,20))
plotMA(resLFC, ylim=c(-20,20))
# because we are interested in the comparison and not the intercept, we set 'coef=2'
resNorm <- lfcShrink(dds, coef=6, type="normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
resAsh <- lfcShrink(dds, coef=6, type="ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-20,20)
plotMA(resLFC, xlim=xlim, ylim=ylim, main="apeglm")
plotMA(resNorm, xlim=xlim, ylim=ylim, main="normal")
plotMA(resAsh, xlim=xlim, ylim=ylim, main="ashr")
plotCounts(dds, gene=which.max(res$log2FoldChange), intgroup="Tissue")
plotCounts(dds, gene=which.min(res$log2FoldChange), intgroup="Tissue")
Transforming count data for visualization
vsd <- vst(dds, blind=FALSE)
rld <- rlog(dds, blind=FALSE)
ntd <- normTransform(dds) # this gives log2(n + 1)
meanSdPlot(assay(vsd), main = "vsd")
## Warning in geom_hex(bins = bins, ...): Ignoring unknown parameters: `main`
meanSdPlot(assay(rld))
meanSdPlot(assay(ntd))
Will move forward with vst transformation for visualizations
df <- as.data.frame(colData(dds)[,c("Tissue","Fragment")])
#view all genes
pheatmap(assay(vsd), cluster_rows=TRUE, show_rownames=FALSE,
cluster_cols=TRUE, cutree_cols = 2,annotation_col=df)
#view highest count genes
select <- order(rowMeans(counts(dds,normalized=TRUE)),
decreasing=TRUE)[1:20]
pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
cluster_cols=TRUE, cutree_cols = 2,annotation_col=df)
#view most significantly differentially expressed genes
select <- order(res$padj)[1:20]
pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
cluster_cols=TRUE, cutree_cols = 2,annotation_col=df)
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$Tissue, vsd$Fragment, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors)
pcaData <- plotPCA(vsd, intgroup=c("Tissue", "Fragment"), returnData=TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
PCA <- ggplot(pcaData, aes(PC1, PC2, color=Tissue, shape=Fragment)) +
geom_point(size=2) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed() + theme_bw()
save_ggplot(PCA, "../output_RNA/differential_expression/PCA")
PCA_small <- ggplot(pcaData, aes(PC1, PC2, color=Tissue)) +
geom_point(size=2) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed() + theme_bw()
ggsave(filename = paste0("../output_RNA/differential_expression/PCA_small", ".png"), plot = PCA_small, width = 4, height = 2.5, units = "in", dpi = 300)
Clearly, the majority of the variance in the data is explained by tissue type!
Download annotation files from genome website
# wget files
wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.Conserved_Domain_Search_results.txt.gz
wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.EggNog_results.txt.gz
wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.KEGG_results.txt.gz
# move to references direcotry
mv *gz ../references
# unzip files
gunzip ../references/*gz
EggNog <- read.delim("../references/Pocillopora_acuta_HIv2.genes.EggNog_results.txt") %>% dplyr::rename("query" = X.query)
CDSearch <- read.delim("../references/Pocillopora_acuta_HIv2.genes.Conserved_Domain_Search_results.txt", quote = "") %>% dplyr::rename("query" = X.Query)
KEGG <- read.delim("../references/Pocillopora_acuta_HIv2.genes.KEGG_results.txt", header = FALSE) %>% dplyr::rename("query" = V1, "KeggTerm" = V2)
DE_05$query <- rownames(DE_05)
DE_05_annot <- DE_05 %>% left_join(CDSearch) %>% select(query,everything())
## Joining with `by = join_by(query)`
DE_05_eggnog <- DE_05 %>% left_join(EggNog) %>% select(query,everything())
## Joining with `by = join_by(query)`
annot_all <- as.data.frame(rownames(dds)) %>% dplyr::rename("query" = `rownames(dds)`) %>% left_join(CDSearch)
## Joining with `by = join_by(query)`
eggnog_all <- as.data.frame(rownames(dds)) %>% dplyr::rename("query" = `rownames(dds)`) %>% left_join(EggNog)
## Joining with `by = join_by(query)`
write.csv(as.data.frame(DE_05_eggnog),
file="../output_RNA/differential_expression/DE_05_eggnog_annotation.csv")
df <- as.data.frame(colData(dds)[,c("Tissue","Fragment")])
gene_labels <- eggnog_all %>% select(query,PFAMs) %>%
mutate_all(~ ifelse(is.na(.), "", .)) %>% #replace NAs with "" for labelling purposes
separate(PFAMs, into = c("PFAMs", "rest of name"), sep = ",(?=.*?,)", extra = "merge")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 12723 rows [1, 2, 3, 4,
## 5, 7, 8, 9, 11, 12, 13, 14, 15, 16, 19, 20, 21, 23, 24, 25, ...].
#view most significantly differentially expressed genes
select <- order(res$padj)[1:50]
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
cluster_cols=TRUE, cutree_cols = 2,annotation_col=df,
labels_row =gene_labels[select,"PFAMs"], fontsize_row = 5)
top50_DE
save_ggplot(top50_DE, "../output_RNA/differential_expression/top50_DE")
#view genes Higher in Aboral, Lower in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange,decreasing = TRUE)[1:50]
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_Aboral <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),cluster_rows=FALSE, show_rownames=TRUE,
cluster_cols=TRUE, cutree_cols = 2,annotation_col=df,
labels_row =gene_labels[select,"PFAMs"], fontsize_row = 5)
up_Aboral
save_ggplot(up_Aboral, "../output_RNA/differential_expression/up_Aboral")
#view genes Lower in Aboral, Higher in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange)[1:50]
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_OralEpi <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),cluster_rows=FALSE, show_rownames=TRUE,
cluster_cols=TRUE, cutree_cols = 2,annotation_col=df,
labels_row =gene_labels[select,"PFAMs"], fontsize_row = 5)
up_OralEpi
save_ggplot(up_OralEpi, "../output_RNA/differential_expression/up_OralEpi")
MarkerGenes <- read.csv("../references/Pacuta_MarkerGenes_Levy2021.csv") %>% dplyr::rename("query" = 1, "List" = 2, "definition" = 3)
MarkerGenes$def_short <- ifelse(nchar(MarkerGenes$definition) > 20,
paste0(substr(MarkerGenes$definition, 1, 17), "..."),
MarkerGenes$definition)
Biomin <- read.csv("../references/Pacuta_Biomin.csv") %>% dplyr::rename("query" = Pocillopora_acuta_best_hit) %>% select(-c(accessionnumber.geneID, Ref))
Biomin <- Biomin %>%
group_by(query,List) %>%
summarize(definition = paste(unique(definition), collapse = ","))
## `summarise()` has grouped output by 'query'. You can override using the
## `.groups` argument.
Biomin$def_short <- ifelse(nchar(Biomin$definition) > 40,
paste0(substr(Biomin$definition, 1, 37), "..."),
Biomin$definition)
DE_05$query <- rownames(DE_05)
DE_05_biomin <- DE_05 %>% left_join(Biomin) %>% select(query,everything()) %>% drop_na()
## Joining with `by = join_by(query)`
DE_05_marker <- DE_05 %>% left_join(MarkerGenes) %>% select(query,everything()) %>% drop_na()
## Joining with `by = join_by(query)`
write.csv(as.data.frame(DE_05_marker),
file="../output_RNA/differential_expression/DE_05_markergene_annotation.csv")
biomin_all_counts <- as.data.frame(counts(dds)) %>% mutate(query = rownames(dds)) %>% select(query,everything()) %>% left_join(Biomin)
## Joining with `by = join_by(query)`
biomin_all_res <- as.data.frame(resLFC) %>% mutate(query = rownames(resLFC)) %>% select(query,everything()) %>% left_join(Biomin)
## Joining with `by = join_by(query)`
markers_all_counts <- as.data.frame(counts(dds)) %>% mutate(query = rownames(dds)) %>% select(query,everything()) %>% left_join(MarkerGenes)
## Joining with `by = join_by(query)`
markers_all_res <- as.data.frame(resLFC) %>% mutate(query = rownames(resLFC)) %>% select(query,everything()) %>% left_join(MarkerGenes)
## Joining with `by = join_by(query)`
df <- as.data.frame(colData(dds)[,c("Tissue","Fragment")])
#view biomin genes that are differntially expressed
z_scores <- t(scale(t(assay(vsd)[DE_05_biomin$query, ]), center = TRUE, scale = TRUE))
DE_biomin <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, cutree_cols = 2,annotation_col=df,
labels_row = DE_05_biomin$def_short, fontsize_row = 5)
DE_biomin
save_ggplot(DE_biomin, "../output_RNA/differential_expression/DE_biomin")
#view marker genes that are differntially expressed
z_scores <- t(scale(t(assay(vsd)[DE_05_marker$query, ]), center = TRUE, scale = TRUE))
DE_marker <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, cutree_cols = 2,cutree_rows = 5,annotation_col=df,
labels_row = DE_05_marker$List, fontsize_row = 4)
DE_marker
save_ggplot(DE_marker, "../output_RNA/differential_expression/DE_marker")
DE_05_marker_grouped <- DE_05_marker %>% arrange(List) %>% mutate(List = as.factor(List))
z_scores <- t(scale(t(assay(vsd)[DE_05_marker_grouped$quer, ]), center = TRUE, scale = TRUE))
DE_05_marker_grouped_plot <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
cluster_cols=TRUE, cutree_cols = 2,annotation_col=df,
labels_row = DE_05_marker_grouped$List, fontsize_row = 5)
DE_05_marker_grouped_plot
save_ggplot(DE_05_marker_grouped_plot, "../output_RNA/differential_expression/DE_05_marker_grouped")
Biomin_volcano <- EnhancedVolcano(biomin_all_res,
lab = biomin_all_res$def_short,
x = 'log2FoldChange',
y = 'padj',
pCutoff = 0.01,
drawConnectors = TRUE,
widthConnectors = 0.75,
pointSize = 1,
labSize = 2,boxedLabels = TRUE,max.overlaps = 40)
save_ggplot(Biomin_volcano, "../output_RNA/differential_expression/Biomin_volcano")
Marker_volcano <- EnhancedVolcano(markers_all_res,
lab = markers_all_res$List,
x = 'log2FoldChange',
y = 'padj',
pCutoff = 0.01,
drawConnectors = TRUE,
widthConnectors = 0.75,
pointSize = 1,
labSize = 2,boxedLabels = TRUE,max.overlaps = 60)
save_ggplot(Marker_volcano, "../output_RNA/differential_expression/Marker_volcano")
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Marker_volcano_names <- EnhancedVolcano(markers_all_res,
lab = markers_all_res$def_short,
x = 'log2FoldChange',
y = 'padj',
pCutoff = 0.01,
drawConnectors = TRUE,
widthConnectors = 0.75,
pointSize = 1,
labSize = 2,boxedLabels = TRUE,max.overlaps = 60)
save_ggplot(Marker_volcano_names, "../output_RNA/differential_expression/Marker_volcano_names")
# wget protein sequence reference file
wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.pep.faa.gz
# move to references direcotry
mv *gz ../references
# unzip files
gunzip ../references/*gz
#get the names of all the DEGs from the first column of the DEG csv file
tail -n +2 ../output_RNA/differential_expression/DEG_05.csv | cut -d',' -f1 | tr -d '"' > ../output_RNA/differential_expression/DEG_05_names.csv
#grep this file against the protein fasta file, first with wc -l to make sure the number of lines is correct (should be your number of DEGs)
grep -f ../output_RNA/differential_expression/DEG_05_names.csv ../references/Pocillopora_acuta_HIv2.genes.pep.faa | wc -l
#grep each header with the protein sequence after ("-A 1") and save to a new file
grep -A 1 -f ../output_RNA/differential_expression/DEG_05_names.csv ../references/Pocillopora_acuta_HIv2.genes.pep.faa > ../output_RNA/differential_expression/DEG_05_seqs.txt
On andromeda:
Blastp-ing only the DE genes against the entire nr database (will take a while)
cd ../scripts
nano DEG_05_blast.sh
#!/bin/bash
#SBATCH --job-name="DE_blast"
#SBATCH -t 240:00:00
#SBATCH --export=NONE
#SBATCH --mail-type=BEGIN,END,FAIL #email you when job starts, stops and/or fails
#SBATCH --mail-user=zdellaert@uri.edu #your email to send notifications
#SBATCH --mem=500GB
#SBATCH --error=../scripts/outs_errs/"%x_error.%j" #write out slurm error reports
#SBATCH --output=../scripts/outs_errs/"%x_output.%j" #write out any program outpus
#SBATCH --account=putnamlab
#SBATCH --nodes=2 --ntasks-per-node=24
module load BLAST+/2.15.0-gompi-2023a
cd ../output_RNA/differential_expression #set working directory
mkdir blast
cd blast
#nr database location andromeda: /data/shared/ncbi-db/.ncbirc
# points to current location: cat /data/shared/ncbi-db/.ncbirc
# [BLAST]
# BLASTDB=/data/shared/ncbi-db/2024-11-10
blastp -query ../DEG_05_seqs.txt -db nr -out DEG_05_blast_results.txt -outfmt 0 -evalue 1E-05 \
-num_threads 48 \
-max_target_seqs 10
blastp -query ../DEG_05_seqs.txt -db nr -out DEG_05_blast_results_tab.txt -outfmt 6 -evalue 1E-05 \
-num_threads 48 \
-max_target_seqs 1 \
-max_hsps 1
echo "Blast complete!" $(date)
sbatch DEG_05_blast.sh
On unity:
swissprot based on https://github.com/urol-e5/deep-dive/blob/main/D-Apul/code/20-Apul-gene-annotation.Rmd and https://github.com/urol-e5/deep-dive/blob/main/F-Pmea/code/20-Pmea-gene-annotation.Rmd
Steven’s notebook post here: https://sr320.github.io/tumbling-oysters/posts/sr320-27-go/
mkdir ../references/blast_dbs
cd ../references/blast_dbs
curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
mv uniprot_sprot.fasta.gz uniprot_sprot_r2024_10_02.fasta.gz
gunzip -k uniprot_sprot_r2024_10_02.fasta.gz
rm uniprot_sprot_r2024_10_02.fasta.gz
head uniprot_sprot_r2024_10_02.fasta
echo "Number of Sequences"
grep -c ">" uniprot_sprot_r2024_10_02.fasta
# 572214 sequences
module load blast-plus/2.14.1
makeblastdb \
-in ../references/blast_dbs/uniprot_sprot_r2024_10_02.fasta \
-dbtype prot \
-out ../references/blast_dbs/uniprot_sprot_r2024_10_02
cd ../scripts
nano blastp_SwissProt.sh
#!/bin/bash
#SBATCH -t 18:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=48
#SBATCH --mem=500GB
#SBATCH --export=NONE
#SBATCH --error=../scripts/outs_errs/"%x_error.%j" #write out slurm error reports
#SBATCH --output=../scripts/outs_errs/"%x_output.%j" #write out any program outpus
#SBATCH --mail-type=BEGIN,END,FAIL #email you when job starts, stops and/or fails
#SBATCH -D /project/pi_hputnam_uri_edu/zdellaert/LaserCoral #set working directory
module load blast-plus/2.14.1
cd references/
mkdir annotation
fasta="Pocillopora_acuta_HIv2.genes.pep.faa"
blastp \
-query $fasta \
-db blast_dbs/uniprot_sprot_r2024_10_02 \
-out annotation/blastp_SwissProt_out.tab \
-evalue 1E-05 \
-num_threads 48 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6
echo "Blast complete!" $(date)
cd references/annotation/
tr '|' '\t' < blastp_SwissProt_out.tab > blastp_SwissProt_out_sep.tab
cd ../references/annotation/
curl -H "Accept: text/plain; format=tsv" "https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Cgo_p%2Cgo%2Cgo_id%2Cgo_c%2Cgo_f&format=tsv&query=%28reviewed%3Atrue%29" -o SwissProt-Annot-GO_111524.tsv
wc -l SwissProt-Annot-GO_111524.tsv
#572215
All code below based on https://github.com/urol-e5/deep-dive/blob/main/D-Apul/code/20-Apul-gene-annotation.Rmd and https://github.com/urol-e5/deep-dive/blob/main/F-Pmea/code/20-Pmea-gene-annotation.Rmd
Steven’s notebook post here: https://sr320.github.io/tumbling-oysters/posts/sr320-27-go/
bltabl <- read.csv("../references/annotation/blastp_SwissProt_out_sep.tab", sep = '\t', header = FALSE)
spgo <- read.csv("../references/annotation/SwissProt-Annot-GO_111524.tsv", sep = '\t', header = TRUE)
annot_tab <- left_join(bltabl, spgo, by = c("V3" = "Entry")) %>%
select(
query = V1,
blast_hit = V3,
evalue = V13,
ProteinNames = Protein.names,
BiologicalProcess = Gene.Ontology..biological.process.,
GeneOntologyIDs = Gene.Ontology.IDs
)
head(annot_tab)
## query blast_hit evalue
## 1 Pocillopora_acuta_HIv2___RNAseq.g24143.t1a Q4JAI4 1.02e-37
## 2 Pocillopora_acuta_HIv2___RNAseq.g18333.t1 O08807 9.62e-116
## 3 Pocillopora_acuta_HIv2___RNAseq.g7985.t1 O74212 3.56e-158
## 4 Pocillopora_acuta_HIv2___TS.g15308.t1 Q09575 1.08e-12
## 5 Pocillopora_acuta_HIv2___RNAseq.g2057.t1 P0C1P0 8.81e-14
## 6 Pocillopora_acuta_HIv2___RNAseq.g4696.t1 Q9W2Q5 8.98e-69
## ProteinNames
## 1 Methionine synthase (EC 2.1.1.-) (Homocysteine methyltransferase)
## 2 Peroxiredoxin-4 (EC 1.11.1.24) (Antioxidant enzyme AOE372) (Peroxiredoxin IV) (Prx-IV) (Thioredoxin peroxidase AO372) (Thioredoxin-dependent peroxide reductase A0372) (Thioredoxin-dependent peroxiredoxin 4)
## 3 Acyl-lipid (8-3)-desaturase (EC 1.14.19.30) (Delta(5) fatty acid desaturase) (Delta-5 fatty acid desaturase)
## 4 Uncharacterized protein K02A2.6
## 5 Phosphatidylinositol N-acetylglucosaminyltransferase subunit Y (Phosphatidylinositol-glycan biosynthesis class Y protein) (PIG-Y)
## 6 Calcium and integrin-binding family member 2
## BiologicalProcess
## 1 methionine biosynthetic process [GO:0009086]; methylation [GO:0032259]
## 2 cell redox homeostasis [GO:0045454]; extracellular matrix organization [GO:0030198]; hydrogen peroxide catabolic process [GO:0042744]; male gonad development [GO:0008584]; negative regulation of male germ cell proliferation [GO:2000255]; protein maturation by protein folding [GO:0022417]; reactive oxygen species metabolic process [GO:0072593]; response to oxidative stress [GO:0006979]; spermatogenesis [GO:0007283]
## 3 long-chain fatty acid biosynthetic process [GO:0042759]; unsaturated fatty acid biosynthetic process [GO:0006636]
## 4 DNA integration [GO:0015074]
## 5 GPI anchor biosynthetic process [GO:0006506]
## 6 calcium ion homeostasis [GO:0055074]; phototransduction [GO:0007602]
## GeneOntologyIDs
## 1 GO:0003871; GO:0008270; GO:0009086; GO:0032259
## 2 GO:0005615; GO:0005737; GO:0005739; GO:0005783; GO:0005790; GO:0005829; GO:0006979; GO:0007283; GO:0008379; GO:0008584; GO:0022417; GO:0030198; GO:0042744; GO:0042802; GO:0045454; GO:0072593; GO:0140313; GO:2000255
## 3 GO:0006636; GO:0016020; GO:0020037; GO:0042759; GO:0046872; GO:0102866
## 4 GO:0003676; GO:0005737; GO:0008270; GO:0015074; GO:0019899; GO:0042575
## 5 GO:0000506; GO:0006506
## 6 GO:0000287; GO:0005509; GO:0005737; GO:0007602; GO:0055074
write.table(annot_tab,
file = "../references/annotation/protein-GO.tsv",
sep = "\t",
row.names = FALSE,
quote = FALSE)
DE_05_SwissProt <- DE_05 %>% left_join(annot_tab) %>% select(query,everything())
## Joining with `by = join_by(query)`
write.csv(as.data.frame(DE_05_SwissProt),
file="../output_RNA/differential_expression/DE_05_SwissProt_annotation.csv")
df <- as.data.frame(colData(dds)[,c("Tissue","Fragment")])
DE_05_SwissProt$short_name <- ifelse(nchar(DE_05_SwissProt$ProteinNames) > 30,
paste0(substr(DE_05_SwissProt$ProteinNames, 1, 27), "..."),
DE_05_SwissProt$ProteinNames)
gene_labels <- DE_05_SwissProt %>%
select(query,short_name) %>%
mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes
#view most significantly differentially expressed genes
select <- order(res$padj)[1:50]
rownames(res)[select]
## [1] "Pocillopora_acuta_HIv2___RNAseq.g6351.t1"
## [2] "Pocillopora_acuta_HIv2___RNAseq.g28575.t1a"
## [3] "Pocillopora_acuta_HIv2___RNAseq.g27038.t1"
## [4] "Pocillopora_acuta_HIv2___RNAseq.g2165.t1"
## [5] "Pocillopora_acuta_HIv2___TS.g18104.t1"
## [6] "Pocillopora_acuta_HIv2___RNAseq.g14330.t3"
## [7] "Pocillopora_acuta_HIv2___RNAseq.g14025.t1"
## [8] "Pocillopora_acuta_HIv2___RNAseq.g14090.t1"
## [9] "Pocillopora_acuta_HIv2___RNAseq.g10378.t1"
## [10] "Pocillopora_acuta_HIv2___RNAseq.g3832.t1"
## [11] "Pocillopora_acuta_HIv2___RNAseq.g8006.t1"
## [12] "Pocillopora_acuta_HIv2___RNAseq.g14253.t1"
## [13] "Pocillopora_acuta_HIv2___RNAseq.g25431.t1"
## [14] "Pocillopora_acuta_HIv2___RNAseq.11056_t"
## [15] "Pocillopora_acuta_HIv2___RNAseq.g9588.t1"
## [16] "Pocillopora_acuta_HIv2___TS.g30765.t1"
## [17] "Pocillopora_acuta_HIv2___RNAseq.g14021.t1"
## [18] "Pocillopora_acuta_HIv2___RNAseq.g19082.t1"
## [19] "Pocillopora_acuta_HIv2___RNAseq.g16202.t1"
## [20] "Pocillopora_acuta_HIv2___TS.g19991.t2"
## [21] "Pocillopora_acuta_HIv2___RNAseq.g20860.t1"
## [22] "Pocillopora_acuta_HIv2___RNAseq.g8588.t1"
## [23] "Pocillopora_acuta_HIv2___RNAseq.g26604.t1"
## [24] "Pocillopora_acuta_HIv2___TS.g18100.t1"
## [25] "Pocillopora_acuta_HIv2___TS.g27642.t1b"
## [26] "Pocillopora_acuta_HIv2___RNAseq.g11588.t1"
## [27] "Pocillopora_acuta_HIv2___RNAseq.g25327.t1"
## [28] "Pocillopora_acuta_HIv2___RNAseq.g12281.t1"
## [29] "Pocillopora_acuta_HIv2___RNAseq.g19085.t1"
## [30] "Pocillopora_acuta_HIv2___RNAseq.g19284.t1"
## [31] "Pocillopora_acuta_HIv2___TS.g11360.t1"
## [32] "Pocillopora_acuta_HIv2___RNAseq.g7803.t1"
## [33] "Pocillopora_acuta_HIv2___TS.g16008.t2"
## [34] "Pocillopora_acuta_HIv2___RNAseq.g22978.t3b"
## [35] "Pocillopora_acuta_HIv2___RNAseq.g2406.t1"
## [36] "Pocillopora_acuta_HIv2___RNAseq.g8119.t1"
## [37] "Pocillopora_acuta_HIv2___TS.g9414.t1"
## [38] "Pocillopora_acuta_HIv2___RNAseq.g21374.t1"
## [39] "Pocillopora_acuta_HIv2___RNAseq.g21000.t1"
## [40] "Pocillopora_acuta_HIv2___RNAseq.g5252.t1"
## [41] "Pocillopora_acuta_HIv2___RNAseq.g22853.t1"
## [42] "Pocillopora_acuta_HIv2___RNAseq.g17117.t1"
## [43] "Pocillopora_acuta_HIv2___RNAseq.g25759.t1"
## [44] "Pocillopora_acuta_HIv2___TS.g16802.t1"
## [45] "Pocillopora_acuta_HIv2___RNAseq.g24120.t1"
## [46] "Pocillopora_acuta_HIv2___RNAseq.30415_t"
## [47] "Pocillopora_acuta_HIv2___RNAseq.g14336.t1"
## [48] "Pocillopora_acuta_HIv2___RNAseq.g26594.t2"
## [49] "Pocillopora_acuta_HIv2___RNAseq.g19115.t1"
## [50] "Pocillopora_acuta_HIv2___TS.g4983.t1"
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
cluster_cols=TRUE, cutree_cols = 2,annotation_col=df,
labels_row = gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 6)
top50_DE
save_ggplot(top50_DE, "../output_RNA/differential_expression/top50_DE_SwissProt")
#view genes Higher in Aboral, Lower in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange,decreasing = TRUE)[1:50]
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_Aboral <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
cluster_cols=TRUE, cutree_cols = 2,annotation_col=df,
labels_row = gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 5)
up_Aboral
save_ggplot(up_Aboral, "../output_RNA/differential_expression/up_Aboral_SwissProt")
#view genes Lower in Aboral, Higher in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange)[1:50]
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_OralEpi <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
cluster_cols=TRUE, cutree_cols = 2,annotation_col=df,
labels_row =gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 5)
up_OralEpi
save_ggplot(up_OralEpi, "../output_RNA/differential_expression/up_OralEpi_SwissProt")
df <- as.data.frame(colData(dds)[,c("Tissue","Fragment")])
DE_05_SwissProt$short_GO <- ifelse(nchar(DE_05_SwissProt$BiologicalProcess) > 30,
paste0(substr(DE_05_SwissProt$BiologicalProcess, 1, 27), "..."),
DE_05_SwissProt$BiologicalProcess)
gene_labels <- DE_05_SwissProt %>% select(query,short_GO) %>%
mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes
#view most significantly differentially expressed genes
select <- order(res$padj)[1:50]
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
cluster_cols=TRUE, cutree_cols = 2,annotation_col=df,
labels_row =gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 5)
top50_DE
save_ggplot(top50_DE, "../output_RNA/differential_expression/top50_DE_Blast2GO")
#view genes Higher in Aboral, Lower in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange,decreasing = TRUE)[1:50]
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_Aboral <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
cluster_cols=TRUE, cutree_cols = 2,annotation_col=df,
labels_row =gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 5)
up_Aboral
save_ggplot(up_Aboral, "../output_RNA/differential_expression/up_Aboral_Blast2GO")
#view genes Lower in Aboral, Higher in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange)[1:50]
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_OralEpi <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
cluster_cols=TRUE, cutree_cols = 2,annotation_col=df,
labels_row =gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 5)
up_OralEpi
save_ggplot(up_OralEpi, "../output_RNA/differential_expression/up_OralEpi_Blast2GO")
After you’ve confirmed your code works as expected, use renv::snapshot() to record the packages and their sources in the lockfile.”